# install.packages("ggplot2","dplyr")
library(ggplot2)
library(dplyr)
library(tidyr)

Reading in corpus

# R must be at least 3.3.1 for `tm` and `slam` to work.
# install.packages("tm")
# install.packages("SnowballC")
library(tm)
Loading required package: NLP

Attaching package: ‘NLP’

The following object is masked from ‘package:ggplot2’:

    annotate
#system("ls ../input") # do we need this?

Adapted from this Kaggle notebook.

shak<-read.csv("../data/Shakespeare_data.csv",header = TRUE, as.is = TRUE)
#shak<-na.omit(shak)
head(shak)
  Dataline     Play PlayerLinenumber ActSceneLine        Player
1        1 Henry IV               NA                           
2        2 Henry IV               NA                           
3        3 Henry IV               NA                           
4        4 Henry IV                1        1.1.1 KING HENRY IV
5        5 Henry IV                1        1.1.2 KING HENRY IV
6        6 Henry IV                1        1.1.3 KING HENRY IV
                                                                                        PlayerLine
1                                                                                            ACT I
2                                                                     SCENE I. London. The palace.
3 Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others
4                                                           So shaken as we are, so wan with care,
5                                                       Find we a time for frighted peace to pant,
6                                                   And breathe short-winded accents of new broils

Word frequency

# play level word frequency
plays <- unique(shak$Play)
loveFreq<-numeric()
for (i in 1:length(plays)){
    text <- Corpus(VectorSource(paste(shak[shak$Play==plays[i],]$PlayerLine,collapse=" ")))
    text <- tm_map(text, removePunctuation)
    text <- tm_map(text, PlainTextDocument)
    text <- tm_map(text, removeWords, stopwords('english'))
    
    # stemming to merge all "loved", "loving" into one   
    text <- tm_map(text, stemDocument)
    tdm  <- TermDocumentMatrix(text)
    
    loveFreq[i]<-as.numeric(slam::row_sums(tdm)["love"])
  }
lPlay <- data.frame(plays,loveFreq)
lPlay <- na.omit(lPlay)
# order the plays based on the occurence of love
lPlay<-lPlay[order(-lPlay$loveFreq),]
head(lPlay)
                      plays loveFreq
35  Two Gentlemen of Verona      188
28         Romeo and Juliet      160
6            As you like it      138
22 A Midsummer nights dream      128
17       Loves Labours Lost      125
23   Much Ado about nothing      122
# player level word frequency
players <- unique(shak$Player)
loveFreq <- numeric()
for (i in 1:length(players)){
    text <- Corpus(VectorSource(paste(shak[shak$Player==players[i],]$PlayerLine,collapse=" ")))
    text <- tm_map(text, removePunctuation)
    text <- tm_map(text, PlainTextDocument)
    text <- tm_map(text, removeWords, stopwords('english'))
    text <- tm_map(text,stemDocument)
    
    tdm  <- TermDocumentMatrix(text)
    
    loveFreq[i] <- as.numeric(slam::row_sums(tdm)["love"])
  }
lPlayer <- data.frame(players,loveFreq)
lPlayer <- na.omit(lPlayer)
#order
lPlayer <- lPlayer[order(-lPlayer$loveFreq),]
head(lPlayer)
     players loveFreq
904  PROTEUS       59
190 ROSALIND       57
771    ROMEO       56
169   HELENA       46
906    JULIA       41
650     IAGO       40

Visualising corpus

shak %>%
  group_by(Play) %>%
  summarise(n = n()) %>%
  ggplot(., aes(x=reorder(Play, n),y=n)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ggtitle("Length of Shakespeare's plays") +
    theme(legend.position="none") +
    xlab("Play") +
    ylab("Number of lines")

shak %>%
  filter(Play == "Hamlet") %>%
  group_by(Player) %>%
  summarise(n = n()) %>%
  ggplot(., aes(x=reorder(Player, n),y=n)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ggtitle("Speech in Hamlet") +
    theme(legend.position="none") +
    xlab("Player") +
    ylab("Number of lines")

shak %>%
  group_by(Play,Player) %>%
  summarise(n = n()) %>%
  filter(n > 700) %>%
  ggplot(., aes(x=reorder(Player, n),y=n)) +
    geom_bar(aes(fill=Play),stat="identity") +
    coord_flip() +
    ggtitle("Amount of lines by character") +
#    theme(legend.position="none") +
    xlab("Player") +
    ylab("Number of lines")

lPlay %>%
  ggplot(., aes(x=reorder(plays, loveFreq),y=loveFreq)) +
    geom_bar(aes(fill=plays),stat="identity") +
    coord_flip() +
    ggtitle("Love in each play") +
#    theme(legend.position="none") +
    xlab("Play") +
    ylab("frequency of the word 'love'") +
    theme(legend.position = "none")

lPlayer %>%
  filter(loveFreq > 20) %>%
  ggplot(., aes(x=reorder(players, loveFreq),y=loveFreq)) +
    geom_bar(aes(fill=players),stat="identity") +
    coord_flip() +
    ggtitle("Love in each play") +
#    theme(legend.position="none") +
    xlab("Play") +
    ylab("frequency of the word 'love'") +
    theme(legend.position = "none")

Attempting n-grams

library(dplyr)
#install.packages("tidytext")
library(tidytext)
shak %>%
  as_tibble(.) %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word)
# A tibble: 820,204 x 6
   Dataline Play     PlayerLinenumber ActSceneLine Player word  
      <int> <chr>               <int> <chr>        <chr>  <chr> 
 1        1 Henry IV               NA ""           ""     act   
 2        1 Henry IV               NA ""           ""     i     
 3        2 Henry IV               NA ""           ""     scene 
 4        2 Henry IV               NA ""           ""     i     
 5        2 Henry IV               NA ""           ""     london
 6        2 Henry IV               NA ""           ""     the   
 7        2 Henry IV               NA ""           ""     palace
 8        3 Henry IV               NA ""           ""     enter 
 9        3 Henry IV               NA ""           ""     king  
10        3 Henry IV               NA ""           ""     henry 
# ... with 820,194 more rows
shak %>%
  as_tibble(.) %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
  count(word, sort = TRUE)
# A tibble: 24,749 x 2
   word      n
   <chr> <int>
 1 the   27052
 2 and   25082
 3 i     20142
 4 to    18984
 5 of    15862
 6 a     14196
 7 you   13347
 8 my    11875
 9 in    10540
10 that  10441
# ... with 24,739 more rows
shak %>%
  as_tibble(.) %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
Joining, by = "word"
# A tibble: 24,148 x 2
   word      n
   <chr> <int>
 1 thou   5193
 2 thy    3727
 3 thee   3024
 4 lord   2621
 5 sir    2454
 6 enter  2338
 7 love   1927
 8 hath   1845
 9 king   1500
10 tis    1384
# ... with 24,138 more rows
shak %>%
  as_tibble(.) %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(n>800) %>%
  ggplot(., aes(x=reorder(word,n),y=n)) +
    geom_bar(stat="identity") +
    coord_flip()
Joining, by = "word"

Using tibbles

How can we organise this so that we can compare across plays?

shak[,c(2,5,6)] %>%
  as_tibble() %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
  filter(word=="love" | word =="king" | word=="death" | word=="sweet") %>%
  #add_count(Player) %>%
  group_by(Player,Play,word) %>%
  summarise(n=n()) %>%
  #anti_join(stop_words) %>%
  filter(  Play == "Hamlet" | 
           Play == "King Lear" | 
           Play == "A Midsummer nights dream" | 
           Play == "Othello" | 
           Play == "Henry V" | 
           Play == "Romeo and Juliet") %>%
  arrange(desc(n)) %>%
  ggplot(., aes(x=word,y=n)) +
    geom_bar(aes(fill=word),stat="identity") +
#    coord_flip() +
    facet_wrap(~Play)

Is there a way to break it down to see who is saying what?

What about n-grams

word <- c(NA,"thou","thee","thy","thine","dost","shalt","wilt","hast","hath","scene","tis","ii","iii","iv","v","vi","vii")
lexicon <- rep("shakespeare",length(word))
new_stop <- cbind(word,lexicon)
shak_stop <- rbind(new_stop,stop_words)
shak %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE)
# A tibble: 57,371 x 3
   word1    word2       n
   <chr>    <chr>   <int>
 1 enter    king      101
 2 mine     eyes       95
 3 king     henry      88
 4 sir      john       80
 5 mark     antony     76
 6 mine     honour     71
 7 king     richard    51
 8 god      save       48
 9 gracious lord       46
10 noble    lord       46
# ... with 57,361 more rows

Networks

#install.packages("igraph")
#install.packages("ggraph")
library(igraph)
library(ggraph)
library(grid)

Bigrams

set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
p1 <- shak %>%
  filter(Play=="Hamlet") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()
set.seed(814)
p2 <- shak %>%
  filter(Play == "Twelfth Night") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkred", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "salmon", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()
set.seed(814)
p3 <- shak %>%
  filter(Play == "Romeo and Juliet") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkgreen", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "green2", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()
multiplot(p1,p2,p3,cols=3)

set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
p1 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play=="Hamlet") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Hamlet")
set.seed(814)
p2 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "Twelfth Night") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkred", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "salmon", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Twelfth Night")
set.seed(814)
p3 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "Romeo and Juliet") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkgreen", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "green2", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Romeo and Juliet")
set.seed(814)
p4 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "Othello") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkorange", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "orange", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Othello")
set.seed(814)
p5 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "Henry IV") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="cadetblue4", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "cyan", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Henry IV")
set.seed(814)
p6 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "The Tempest") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="violet", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "magenta", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("The Tempest")
multiplot(p1,p2,p3,p4,p5,p6,cols=3)

Trigrams

This should give us a better idea of slightly looser connections

set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
shak %>%
  as_tibble() %>%
  filter(ActSceneLine != "") %>%
  unnest_tokens(input = PlayerLine, output = trigram, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  filter(!word3 %in% shak_stop$word) %>% # filters stop words from third column
  count(word1, word2, word3, sort = TRUE) %>%
  filter(n > 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()

What happens if we treat the first pair and second pair of trigrams as separate bigrams and graph them as before?

Heatmaps

shak %>%
  as_tibble() %>%
  #filter(Play == "Hamlet" | Play == "Loves Labours Lost" | Play == "A Winters Tale") %>%
  filter(ActSceneLine != "") %>%
  mutate(ActSceneLine2 = ActSceneLine) %>%
  separate(ActSceneLine2, c("act", "scene", "line")) %>%
  count(Play,act,scene, sort=TRUE) %>%
  transmute(play=Play, act=as.numeric(act), scene=as.numeric(scene), n=n)
# A tibble: 737 x 4
   play                  act scene     n
   <chr>               <dbl> <dbl> <int>
 1 Loves Labours Lost      5     2   972
 2 A Winters Tale          4     4   929
 3 Hamlet                  2     2   616
 4 King John               2     1   609
 5 The Tempest             1     2   596
 6 Cymbeline               5     5   584
 7 Measure for measure     5     1   580
 8 Timon of Athens         4     3   577
 9 Richard III             4     4   561
10 A Winters Tale          1     2   539
# ... with 727 more rows

Wrap up

What this all seems to tell us is that we can visualise the structure of the play, separate from their content. Is this useful to you?

---
title: "Processing Shakespeare"
output:
  html_notebook:
    toc: yes
    toc_depth: 2
    toc_float: yes
---

```{r echo=FALSE,message=FALSE}
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
```


```{r,message=FALSE}
# install.packages("ggplot2","dplyr")
library(ggplot2)
library(dplyr)
library(tidyr)
```


# Reading in corpus

```{r}
# R must be at least 3.3.1 for `tm` and `slam` to work.
# install.packages("tm")
# install.packages("SnowballC")
library(tm)
#system("ls ../input") # do we need this?
```


Adapted from [this Kaggle notebook](https://www.kaggle.com/sindhuee/love-in-shakespeare?scriptVersionId=1121270).

```{r}
shak<-read.csv("../data/Shakespeare_data.csv",header = TRUE, as.is = TRUE)
#shak<-na.omit(shak)
head(shak)
```


# Word frequency

```{r}
# play level word frequency
plays <- unique(shak$Play)
loveFreq<-numeric()

for (i in 1:length(plays)){
    text <- Corpus(VectorSource(paste(shak[shak$Play==plays[i],]$PlayerLine,collapse=" ")))
    text <- tm_map(text, removePunctuation)
    text <- tm_map(text, PlainTextDocument)
    text <- tm_map(text, removeWords, stopwords('english'))
    
    # stemming to merge all "loved", "loving" into one   
    text <- tm_map(text, stemDocument)
    tdm  <- TermDocumentMatrix(text)
    
    loveFreq[i]<-as.numeric(slam::row_sums(tdm)["love"])
  }

lPlay <- data.frame(plays,loveFreq)
lPlay <- na.omit(lPlay)

# order the plays based on the occurence of love
lPlay<-lPlay[order(-lPlay$loveFreq),]
head(lPlay)
```

```{r}
# player level word frequency
players <- unique(shak$Player)
loveFreq <- numeric()

for (i in 1:length(players)){
    text <- Corpus(VectorSource(paste(shak[shak$Player==players[i],]$PlayerLine,collapse=" ")))
    text <- tm_map(text, removePunctuation)
    text <- tm_map(text, PlainTextDocument)
    text <- tm_map(text, removeWords, stopwords('english'))
    text <- tm_map(text,stemDocument)
    
    tdm  <- TermDocumentMatrix(text)
    
    loveFreq[i] <- as.numeric(slam::row_sums(tdm)["love"])
  }

lPlayer <- data.frame(players,loveFreq)
lPlayer <- na.omit(lPlayer)
#order
lPlayer <- lPlayer[order(-lPlayer$loveFreq),]

head(lPlayer)
```

# Visualising corpus

```{r}
shak %>%
  group_by(Play) %>%
  summarise(n = n()) %>%
  ggplot(., aes(x=reorder(Play, n),y=n)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ggtitle("Length of Shakespeare's plays") +
    theme(legend.position="none") +
    xlab("Play") +
    ylab("Number of lines")
```

```{r}
shak %>%
  filter(Play == "Hamlet") %>%
  group_by(Player) %>%
  summarise(n = n()) %>%
  ggplot(., aes(x=reorder(Player, n),y=n)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ggtitle("Speech in Hamlet") +
    theme(legend.position="none") +
    xlab("Player") +
    ylab("Number of lines")
```

```{r}
shak %>%
  group_by(Play,Player) %>%
  summarise(n = n()) %>%
  filter(n > 700) %>%
  ggplot(., aes(x=reorder(Player, n),y=n)) +
    geom_bar(aes(fill=Play),stat="identity") +
    coord_flip() +
    ggtitle("Amount of lines by character") +
#    theme(legend.position="none") +
    xlab("Player") +
    ylab("Number of lines")
```

```{r}
lPlay %>%
  ggplot(., aes(x=reorder(plays, loveFreq),y=loveFreq)) +
    geom_bar(aes(fill=plays),stat="identity") +
    coord_flip() +
    ggtitle("Love in each play") +
#    theme(legend.position="none") +
    xlab("Play") +
    ylab("frequency of the word 'love'") +
    theme(legend.position = "none")
```

```{r}
lPlayer %>%
  filter(loveFreq > 20) %>%
  ggplot(., aes(x=reorder(players, loveFreq),y=loveFreq)) +
    geom_bar(aes(fill=players),stat="identity") +
    coord_flip() +
    ggtitle("Love in each play") +
#    theme(legend.position="none") +
    xlab("Play") +
    ylab("frequency of the word 'love'") +
    theme(legend.position = "none")
```

# Attempting n-grams

```{r}
library(dplyr)
#install.packages("tidytext")
library(tidytext)
```


```{r}
shak %>%
  as_tibble(.) %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word)
```

```{r}
shak %>%
  as_tibble(.) %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
  count(word, sort = TRUE)
```

```{r}
shak %>%
  as_tibble(.) %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
```

```{r}
shak %>%
  as_tibble(.) %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(n>800) %>%
  ggplot(., aes(x=reorder(word,n),y=n)) +
    geom_bar(stat="identity") +
    coord_flip()
```

## Using tibbles

How can we organise this so that we can compare across plays?

```{r}
shak[,c(2,5,6)] %>%
  as_tibble() %>%
  unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
  filter(word=="love" | word =="king" | word=="death" | word=="sweet") %>%
  #add_count(Player) %>%
  group_by(Player,Play,word) %>%
  summarise(n=n()) %>%
  #anti_join(stop_words) %>%
  filter(  Play == "Hamlet" | 
           Play == "King Lear" | 
           Play == "A Midsummer nights dream" | 
           Play == "Othello" | 
           Play == "Henry V" | 
           Play == "Romeo and Juliet") %>%
  arrange(desc(n)) %>%
  ggplot(., aes(x=word,y=n)) +
    geom_bar(aes(fill=word),stat="identity") +
#    coord_flip() +
    facet_wrap(~Play)
```

Is there a way to break it down to see who is saying what?

## What about n-grams

```{r fig.width=5, fig.asp=.5}

shak %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  #anti_join(stop_words) %>%
  filter(bigram=="my lord" | bigram =="my lady" | bigram=="my mother" | bigram=="my father" | bigram=="my wife" | bigram=="my husband") %>%
  mutate(gender = bigram) %>%
  mutate(gender = recode_factor(gender,
                `my lord`="masc",
                `my father`="masc",
                `my husband`="masc",
                `my lady`="fem",
                `my mother`="fem",
                `my wife`="fem")) %>%
  group_by(Player,Play,bigram,gender) %>%
  summarise(n=n()) %>%
  mutate(bigramFac = factor(bigram, levels=c("my lord", "my husband", "my father", "my lady", "my wife", "my mother"))) %>%
  # too boring
  filter(  Play != "Henry VI Part 1" &
           Play != "Henry VI Part 2" &
           Play != "Henry VI Part 3" &
           Play != "Pericles" & 
           Play != "Timon of Athens" & 
           Play != "The Tempest") %>%
  # too skewed
  filter(  Play != "Hamlet" &
           Play != "Troilus and Cressida" &
           Play != "Richard III" &
           Play != "Titus Andronicus" & 
           Play != "Henry VIII" & 
           Play != "Much Ado about nothing") %>%
  arrange(desc(n)) %>%
  ggplot(., aes(x=bigramFac,y=n)) +
    geom_bar(aes(fill=gender),stat="identity")+#,position="dodge") +
    scale_y_log10() +
    coord_flip() +
    facet_wrap(~Play,nrow=3)
```

```{r}
shak %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = ngram, token = "ngrams", n = 2) %>%
  mutate(bigram = ngram) %>%
  unnest_tokens(input = ngram, output = word) %>%
  #anti_join(stop_words) %>%
  filter(word=="king" | word=="queen") %>%
  group_by(Play,bigram,word) %>%
  summarise(n=n()) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x=reorder(Play,n))) +
    geom_bar(aes(fill=word),stat="count",position="dodge") +
    #scale_y_log10() +
    coord_flip()
```


```{r}
word <- c(NA,"thou","thee","thy","thine","dost","shalt","wilt","hast","hath","scene","tis","ii","iii","iv","v","vi","vii")
lexicon <- rep("shakespeare",length(word))
new_stop <- cbind(word,lexicon)
shak_stop <- rbind(new_stop,stop_words)
```



```{r}
shak %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE)
```

## Networks

```{r}
#install.packages("igraph")
#install.packages("ggraph")
library(igraph)
library(ggraph)
library(grid)
```

### Bigrams

```{r}
set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
shak %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 22) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()
```



```{r}
set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
p1 <- shak %>%
  filter(Play=="Hamlet") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()

set.seed(814)
p2 <- shak %>%
  filter(Play == "Twelfth Night") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkred", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "salmon", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()

set.seed(814)
p3 <- shak %>%
  filter(Play == "Romeo and Juliet") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkgreen", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "green2", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()

multiplot(p1,p2,p3,cols=3)
```

```{r fig.width=5, fig.asp=.5}
set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
p1 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play=="Hamlet") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Hamlet")

set.seed(814)
p2 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "Twelfth Night") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkred", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "salmon", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Twelfth Night")

set.seed(814)
p3 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "Romeo and Juliet") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkgreen", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "green2", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Romeo and Juliet")

set.seed(814)
p4 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "Othello") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkorange", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "orange", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Othello")

set.seed(814)
p5 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "Henry IV") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="cadetblue4", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "cyan", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("Henry IV")

set.seed(814)
p6 <- shak %>%
  filter(ActSceneLine != "") %>%
  filter(Play == "The Tempest") %>%
  as_tibble() %>%
  unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  count(word1, word2, sort = TRUE) %>%
  filter(n > 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="violet", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "magenta", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void() +
    ggtitle("The Tempest")

multiplot(p1,p2,p3,p4,p5,p6,cols=3)
```

### Trigrams

This should give us a better idea of slightly looser connections

```{r fig.width=5, fig.asp=.5}
set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
shak %>%
  as_tibble() %>%
  filter(ActSceneLine != "") %>%
  unnest_tokens(input = PlayerLine, output = trigram, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  filter(!word3 %in% shak_stop$word) %>% # filters stop words from third column
  count(word1, word2, word3, sort = TRUE) %>%
  filter(n > 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()
```


What happens if we treat the first pair and second pair of trigrams as separate bigrams and graph them as before?

```{r fig.width=5, fig.asp=.75}
set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.075, "inches"))
w1w2 <- shak %>%
  as_tibble() %>%
  filter(ActSceneLine != "") %>%
  unnest_tokens(input = PlayerLine, output = trigram, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  filter(!word3 %in% shak_stop$word) %>% # filters stop words from third column
  count(word1, word2, word3, sort = TRUE) %>%
  mutate(set = 1) %>%
  transmute(word1=word1,word2=word2,n=n,set=set)
w2w3 <- shak %>%
  as_tibble() %>%
  filter(ActSceneLine != "") %>%
  unnest_tokens(input = PlayerLine, output = trigram, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% # separates bigram into two columns, one for each word
  filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
  filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
  filter(!word3 %in% shak_stop$word) %>% # filters stop words from third column
  count(word1, word2, word3, sort = TRUE) %>%
  mutate(set = 2) %>%
  transmute(word1=word2,word2=word3,n=n,set=set)
wXwY <- bind_rows(w1w2,w2w3)

wXwY %>%
  filter(n>=3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
    geom_node_point(color = "lightblue", size = 5) +
    geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = TRUE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_text(aes(label = name), alpha=.75, repel=TRUE) + # , vjust = 1, hjust = 1) +
    theme_void()
```

# Heatmaps

```{r}
shak %>%
  as_tibble() %>%
  #filter(Play == "Hamlet" | Play == "Loves Labours Lost" | Play == "A Winters Tale") %>%
  filter(ActSceneLine != "") %>%
  mutate(ActSceneLine2 = ActSceneLine) %>%
  separate(ActSceneLine2, c("act", "scene", "line")) %>%
  count(Play,act,scene, sort=TRUE) %>%
  transmute(play=Play, act=as.numeric(act), scene=as.numeric(scene), n=n)
```


```{r}
shak %>%
  as_tibble() %>%
  filter(Play == "Hamlet" | Play == "King John" | Play == "The Tempest" | 
           Play == "Cymbeline" | Play == "Measure for measure" | Play == "Timon of Athens" | 
           Play == "Richard III" | Play == "Loves Labours Lost" | Play == "A Winters Tale" | 
           Play == "Othello" | Play == "Romeo and Juliet" | Play == "Henry V") %>% 
  filter(ActSceneLine != "") %>%
  mutate(ActSceneLine2 = ActSceneLine) %>%
  separate(ActSceneLine2, c("act", "scene", "line")) %>%
  count(Play,act, sort=TRUE) %>%
  transmute(play=Play, act=as.integer(act), n=n) %>%
  ggplot(aes(x=act,y=reorder(play, n))) + 
    geom_tile(aes(fill = n), colour = "white") + scale_fill_gradient(low = "white", high = "steelblue")
```

```{r}
shak %>%
  as_tibble() %>%
  filter(Play == "Hamlet" | Play == "King John" | Play == "The Tempest" | 
           Play == "Cymbeline" | Play == "Measure for measure" | Play == "Timon of Athens" | 
           Play == "Richard III" | Play == "Loves Labours Lost" | Play == "A Winters Tale" | 
           Play == "Othello" | Play == "Romeo and Juliet" | Play == "Henry V") %>% 
  filter(ActSceneLine != "") %>%
  mutate(ActSceneLine2 = ActSceneLine) %>%
  separate(ActSceneLine2, c("act", "scene", "line")) %>%
  count(Play,scene, sort=TRUE) %>%
  transmute(play=Play, scene=as.integer(scene), n=n) %>%
  ggplot(aes(x=scene,y=reorder(play, n))) + 
    geom_tile(aes(fill = n), colour = "white") + scale_fill_gradient(low = "white", high = "steelblue") +
    scale_x_continuous(breaks=c(0:8))
```

```{r fig.width=5, fig.asp=.35}
shak %>%
  as_tibble() %>%
  filter(Play == "Hamlet" | Play == "King John" | Play == "The Tempest" | 
           Play == "Cymbeline" | Play == "Measure for measure" | Play == "Timon of Athens" | 
           Play == "Richard III" | Play == "Loves Labours Lost" | Play == "A Winters Tale" | 
           Play == "Othello" | Play == "Romeo and Juliet") %>% 
  filter(ActSceneLine != "") %>%
  mutate(ActSceneLine2 = ActSceneLine) %>%
  separate(ActSceneLine2, c("act", "scene", "line")) %>%
  count(Play,act,scene, sort=TRUE) %>%
  transmute(play=Play, act=as.integer(act), scene=as.integer(scene), n=n) %>%
  ggplot(aes(x=scene,y=play)) + 
    geom_tile(aes(fill = n), colour = "white") + 
    scale_fill_gradient(low = "white", high = "red2") +
    scale_x_continuous(breaks=c(0:8)) +
    theme_dark() + 
    facet_wrap(~act, ncol = 5)
```


```{r fig.width=5, fig.asp=.5}
shak %>%
  as_tibble() %>%
  filter(Play == "Hamlet") %>% 
  #filter(Player != "HAMLET" & Player != "LORD POLONIUS" & Player != "KING CLAUDIUS") %>% 
  filter(ActSceneLine != "") %>%
  mutate(ActSceneLine2 = ActSceneLine) %>%
  separate(ActSceneLine2, c("act", "scene", "line")) %>%
  count(Player,act,scene, sort=TRUE) %>%
  transmute(player=Player, act=as.integer(act), scene=as.integer(scene), n=n) %>%
  ggplot(aes(x=scene,y=reorder(player,n))) + 
    geom_tile(aes(fill = n), colour = "white") + 
    scale_fill_gradient(low = "white", high = "red2") +
    scale_x_continuous(breaks=c(0:8)) +
    theme_dark() + 
    facet_wrap(~act, ncol = 5)
```

```{r fig.width=5, fig.asp=.5}
shak %>%
  as_tibble() %>%
  filter(Play == "Twelfth Night") %>% 
  #filter(Player != "HAMLET" & Player != "LORD POLONIUS" & Player != "KING CLAUDIUS") %>% 
  filter(ActSceneLine != "") %>%
  mutate(ActSceneLine2 = ActSceneLine) %>%
  separate(ActSceneLine2, c("act", "scene", "line")) %>%
  count(Player,act,scene, sort=TRUE) %>%
  transmute(player=Player, act=as.integer(act), scene=as.integer(scene), n=n) %>%
  ggplot(aes(x=scene,y=player)) + 
    geom_tile(aes(fill = n), colour = "white") + 
    scale_fill_gradient(low = "white", high = "red2") +
    scale_x_continuous(breaks=c(0:8)) +
    theme_dark() + 
    facet_wrap(~act, ncol = 5)
```

## Wrap up

What this all seems to tell us is that we can visualise the structure of the play, separate from their content. Is this useful to you?